#Importación de librerías
import os
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from pyzipcode import ZipCodeDatabase
from uszipcode import SearchEngine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_error
#Se establece como directorio de ejecición la ubicación local de este Notebook
file_path = os.path.dirname(os.path.abspath("__file__"))
os.chdir(file_path)
#Cargue del archivo
datos_ventas=pd.read_csv("house_sales.csv")
#Dimensiones de los datos
datos_ventas.shape
(18448, 16)
Tenemos 18.448 observaciones y 16 columnas. Los datos están en formatos númerico enteros o flotantes.
#Revisión del tipo de datos de cada columna
datos_ventas.dtypes
price int64 num_bed int64 num_bath float64 size_house int64 size_lot int64 num_floors float64 is_waterfront int64 condition int64 size_basement int64 year_built int64 renovation_date int64 zip int64 latitude float64 longitude float64 avg_size_neighbor_houses int64 avg_size_neighbor_lot int64 dtype: object
Aparentemente no hay datos faltantes.
datos_ventas.isna().sum()
price 0 num_bed 0 num_bath 0 size_house 0 size_lot 0 num_floors 0 is_waterfront 0 condition 0 size_basement 0 year_built 0 renovation_date 0 zip 0 latitude 0 longitude 0 avg_size_neighbor_houses 0 avg_size_neighbor_lot 0 dtype: int64
La distribución de la mayoría de variables hace sentido, pero se evidencian observaciones en las que el número de habitaciones y el número de baños es de cero. Lo más probable es que esos sean en realidad datos faltantes.
datos_ventas.describe(include=[np.number])
| price | num_bed | num_bath | size_house | size_lot | num_floors | is_waterfront | condition | size_basement | year_built | renovation_date | zip | latitude | longitude | avg_size_neighbor_houses | avg_size_neighbor_lot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.844800e+04 | 18448.000000 | 18448.000000 | 18448.000000 | 1.844800e+04 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 |
| mean | 5.423624e+05 | 3.372615 | 2.118888 | 2083.940915 | 1.503602e+04 | 1.494606 | 0.007643 | 3.411698 | 293.571498 | 1971.001138 | 85.145002 | 98077.921455 | 47.560030 | -122.214419 | 1988.306483 | 12571.596216 |
| std | 3.720135e+05 | 0.933892 | 0.772384 | 921.416218 | 4.181455e+04 | 0.540806 | 0.087092 | 0.652593 | 443.607503 | 29.361619 | 403.371263 | 53.497440 | 0.138557 | 0.139910 | 686.173124 | 26329.260211 |
| min | 7.800000e+04 | 0.000000 | 0.000000 | 290.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155933 | -122.518648 | 399.000000 | 651.000000 |
| 25% | 3.218375e+05 | 3.000000 | 1.750000 | 1430.000000 | 5.050000e+03 | 1.000000 | 0.000000 | 3.000000 | 0.000000 | 1952.000000 | 0.000000 | 98033.000000 | 47.471527 | -122.328084 | 1490.000000 | 5100.000000 |
| 50% | 4.500000e+05 | 3.000000 | 2.250000 | 1920.000000 | 7.600500e+03 | 1.500000 | 0.000000 | 3.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.571599 | -122.230688 | 1840.000000 | 7611.000000 |
| 75% | 6.480000e+05 | 4.000000 | 2.500000 | 2560.000000 | 1.062525e+04 | 2.000000 | 0.000000 | 4.000000 | 570.000000 | 1997.000000 | 0.000000 | 98118.000000 | 47.677918 | -122.125733 | 2370.000000 | 10050.000000 |
| max | 7.700000e+06 | 33.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 5.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777624 | -121.315254 | 6110.000000 | 858132.000000 |
Teniendo en cuenta las características de las viviendas con cero habitaciones o cero baños según los datos originales, se confirma que son datos faltantes. Por ejemplo, no es probable que una vivienda de 3.5 pisos y 3064 Ft2 (284 M2) no tenga habitaciones ni baños. Teniendo en cuenta que las observaciones con estos datos faltantes son solo 12 en unos datos con 18.448 observaciones, la eliminación de estas no tendrá mayor impacto en el desempeño predictivo de los modelos. Así pues, se toma la decisión de eliminarlas.
datos_ventas.loc[(datos_ventas['num_bed']==0)|(datos_ventas['num_bath']==0), :]
| price | num_bed | num_bath | size_house | size_lot | num_floors | is_waterfront | condition | size_basement | year_built | renovation_date | zip | latitude | longitude | avg_size_neighbor_houses | avg_size_neighbor_lot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 744 | 1095000 | 0 | 0.00 | 3064 | 4764 | 3.5 | 0 | 3 | 0 | 1990 | 0 | 98102 | 47.636246 | -122.321563 | 2360 | 4000 |
| 2641 | 380000 | 0 | 0.00 | 1470 | 979 | 3.0 | 0 | 3 | 0 | 2006 | 0 | 98133 | 47.714460 | -122.356067 | 1470 | 1399 |
| 2942 | 288000 | 0 | 1.50 | 1430 | 1650 | 3.0 | 0 | 3 | 0 | 1999 | 0 | 98125 | 47.722172 | -122.290286 | 1430 | 1650 |
| 4160 | 228000 | 0 | 1.00 | 390 | 5900 | 1.0 | 0 | 2 | 0 | 1953 | 0 | 98118 | 47.526040 | -122.261427 | 2170 | 6000 |
| 5964 | 1295648 | 0 | 0.00 | 4810 | 28008 | 2.0 | 0 | 3 | 0 | 1990 | 0 | 98053 | 47.664215 | -122.069243 | 4740 | 35061 |
| 7221 | 339950 | 0 | 2.50 | 2290 | 8319 | 2.0 | 0 | 3 | 0 | 1985 | 0 | 98042 | 47.347267 | -122.151189 | 2500 | 8751 |
| 7228 | 240000 | 0 | 2.50 | 1810 | 5669 | 2.0 | 0 | 3 | 0 | 2003 | 0 | 98038 | 47.349347 | -122.052628 | 1810 | 5685 |
| 8329 | 355000 | 0 | 0.00 | 2460 | 8049 | 2.0 | 0 | 3 | 0 | 1990 | 0 | 98031 | 47.409511 | -122.168250 | 2520 | 8050 |
| 8400 | 235000 | 0 | 0.00 | 1470 | 4800 | 2.0 | 0 | 3 | 0 | 1996 | 0 | 98065 | 47.526491 | -121.828222 | 1060 | 7200 |
| 12266 | 139950 | 0 | 0.00 | 844 | 4269 | 1.0 | 0 | 4 | 0 | 1913 | 0 | 98001 | 47.278148 | -122.249697 | 1380 | 9600 |
| 15691 | 265000 | 0 | 0.75 | 384 | 213444 | 1.0 | 0 | 3 | 0 | 2003 | 0 | 98070 | 47.417727 | -122.491217 | 1920 | 224341 |
| 16612 | 142000 | 0 | 0.00 | 290 | 20875 | 1.0 | 0 | 1 | 0 | 1963 | 0 | 98024 | 47.530772 | -121.888423 | 1620 | 22850 |
datos_ventas=datos_ventas.loc[(datos_ventas['num_bed']!=0)&(datos_ventas['num_bath']!=0), :]
#Se utiliza la librería Pandas Profiling para ampliar el EDA
prof = ProfileReport(datos_ventas)
prof
Summarize dataset: 100%|██████████| 255/255 [00:46<00:00, 5.54it/s, Completed] Generate report structure: 100%|██████████| 1/1 [00:07<00:00, 7.63s/it] Render HTML: 100%|██████████| 1/1 [00:08<00:00, 8.50s/it]